In this section, we use appropriate visualization techniques to explore the dataset and to identify patterns and relationships in the data.
# Convert discrete variables to factors
factor_cols <- c("gender", "race", "smoking", "hypertension",
                 "diabetes", "vaccine", "severity", "study")
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)
# Recode the categorical variables to descriptive labels, re-factor them,
# and set the reference levels used by the models below.
# FIX: the original wrote `dat$gender` etc. inside mutate(); that reaches back
# to the pre-pipe data frame rather than the piped data, which silently breaks
# if an earlier pipe step ever filters or reorders rows. Bare column names let
# case_when() operate on the piped data. mutate_if() is superseded; use
# across(where(...)) instead.
dat <- dat %>%
  select(-id) %>%
  mutate(
    gender = case_when(
      gender == 1 ~ "Male",
      TRUE ~ "Female"
    ),
    race = case_when(
      race == 1 ~ "White",
      race == 2 ~ "Asian",
      race == 3 ~ "Black",
      TRUE ~ "Hispanic"
    ),
    smoking = case_when(
      smoking == 0 ~ "Never Smoked",
      smoking == 1 ~ "Former Smoker",
      TRUE ~ "Current Smoker"
    )
  ) %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(
    race = relevel(race, ref = "White"),
    smoking = relevel(smoking, ref = "Never Smoked"),
    study = relevel(study, ref = "B")
  )
# Continuous variables: scatterplot of each predictor vs. recovery time,
# with a smoother to reveal non-linear trends.
# FIX: aes_string() is deprecated since ggplot2 3.0.0 (see the warning this
# chunk emitted); use tidy evaluation via .data[[var]] instead.
continuous_vars <- c("age", "height", "weight", "bmi", "SBP", "LDL")
for (var in continuous_vars) {
  p <- ggplot(dat, aes(x = .data[[var]], y = recovery_time)) +
    geom_point() +
    geom_smooth() +
    labs(x = var, y = "Time to Recovery (days)",
         title = paste(var, "vs. Time to Recovery")) +
    theme_classic()
  print(p)  # explicit print needed inside a for loop
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Discrete variables: boxplot of recovery time within each level.
# FIX: aes_string() is deprecated since ggplot2 3.0.0; use .data[[var]].
discrete_vars <- c("gender", "race", "smoking", "hypertension",
                   "diabetes", "vaccine", "severity", "study")
for (var in discrete_vars) {
  p <- ggplot(dat, aes(x = .data[[var]], y = recovery_time)) +
    geom_boxplot() +
    labs(x = var, y = "Time to Recovery (days)",
         title = paste(var, "vs. Time to Recovery")) +
    theme_classic()
  print(p)  # explicit print needed inside a for loop
}
# Correlation structure among the continuous predictors
dat_continuous <- dat %>%
  select(age, height, weight, bmi, SBP, LDL)
corrplot(cor(dat_continuous), method = "number", type = "lower")
# Candidate predictor subsets considered during EDA; earlier candidates are
# kept commented out for reference.
# dat_subset <- dat %>%
#   select(c(age, bmi, gender, race, smoking, hypertension, diabetes, vaccine, severity, study, recovery_time))
# dat_subset <- dat %>%
#   select(c(age, height, weight, gender, race, smoking, hypertension, diabetes, vaccine, severity, study, recovery_time))
# Final subset used in the "(Subset)" models below:
dat_subset <- dat %>%
select(c(height, weight, vaccine, severity, study, recovery_time))
# Exploratory jitter plot of age vs. recovery time (kept for reference):
# dat %>% ggplot(aes(x = age, y = recovery_time)) +
#   geom_jitter() + geom_smooth() + theme_classic()
# Train/test split (80/20), stratified on the outcome via createDataPartition.
# Full data
set.seed(1)
trainIndex <- createDataPartition(dat$recovery_time, p = 0.8, list = FALSE)
train_data <- dat[trainIndex, ]
test_data <- dat[-trainIndex, ]
# Subset of data
# BUG FIX: the subset split previously indexed dat_subset with `trainIndex`
# (the full-data index) instead of `trainIndex_sub`. With the same seed and
# the same outcome vector the two indices coincide here, but indexing with
# the wrong object is fragile and misleading.
set.seed(1)
trainIndex_sub <- createDataPartition(dat_subset$recovery_time, p = 0.8, list = FALSE)
train_data_sub <- dat_subset[trainIndex_sub, ]
test_data_sub <- dat_subset[-trainIndex_sub, ]
# Resampling schemes: 10-fold CV repeated 5 times, with two tuning rules
ctrl_SE <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  selectionFunction = "oneSE"  # most parsimonious model within one SE of best
)
ctrl_best <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  selectionFunction = "best"   # lowest cross-validated error
)
# Design matrices (intercept column dropped) and response vectors
x <- model.matrix(recovery_time ~ ., train_data)[, -1]
y <- train_data$recovery_time
x_sub <- model.matrix(recovery_time ~ ., train_data_sub)[, -1]
y_sub <- train_data_sub$recovery_time
set.seed(1)
# LASSO model, full data (alpha = 1), tuned over a log-spaced lambda grid
# with the one-SE rule.
# Fixes: `data =` is ignored by train() when x/y are supplied, so it is
# dropped; `standardize = T` -> TRUE (T is a reassignable binding).
lasso.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  trControl = ctrl_SE,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = exp(seq(-6, -1, length = 100))
  ),
  standardize = TRUE
)
# Tuning curve with the selected lambda highlighted
ggplot(lasso.fit, highlight = TRUE) +
  theme_classic()
best_lambda <- lasso.fit$bestTune$lambda
# Coefficients at the selected penalty
coef(lasso.fit$finalModel, s = best_lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -840.04385489
## age 0.16823163
## genderMale -2.59075696
## raceAsian 3.50404233
## raceBlack .
## raceHispanic -0.38081627
## smokingCurrent Smoker 2.97646686
## smokingFormer Smoker 1.72027649
## height 4.86538187
## weight -5.41777863
## bmi 17.43538534
## hypertension1 1.69648547
## diabetes1 -1.46759957
## SBP 0.02233705
## LDL -0.02000477
## vaccine1 -6.58454252
## severity1 6.81026029
## studyA -4.75766922
set.seed(1)
# LASSO model, predictor subset (same fixes as the full-data fit: drop the
# ignored `data =` argument; spell out TRUE instead of T).
lasso.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  trControl = ctrl_SE,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = exp(seq(-6, 2, length = 100))
  ),
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Tuning curve and coefficients at the selected penalty (subset LASSO)
ggplot(lasso.fit.sub, highlight = TRUE) +
  theme_classic()
best_lambda <- lasso.fit.sub$bestTune$lambda
coef(lasso.fit.sub$finalModel, s = best_lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 88.9382606
## height -0.3782789
## weight 0.2434927
## vaccine1 -2.5641618
## severity1 0.7450711
## studyA -0.5830858
set.seed(1)
# Elastic net, full data: joint grid over the mixing parameter alpha and
# penalty lambda, selecting the overall best combination.
# Fixes: drop `data =` (ignored when x/y are supplied); T -> TRUE.
enet.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = seq(0, 1, length = 21),
    lambda = exp(seq(-5, 3, length = 100))
  ),
  trControl = ctrl_best,
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Selected (alpha, lambda) pair
enet.fit$bestTune
## alpha lambda
## 1501 0.75 0.006737947
# Tuning surface (one colour per alpha value) and coefficients at the optimum
myCol <- rainbow(25)
myPar <- list(
  superpose.symbol = list(col = myCol),
  superpose.line = list(col = myCol)
)
plot(enet.fit, par.settings = myPar)
coef(enet.fit$finalModel, s = enet.fit$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -2.002609e+03
## age 1.790879e-01
## genderMale -2.947692e+00
## raceAsian 3.768215e+00
## raceBlack -4.163253e-01
## raceHispanic -7.390642e-01
## smokingCurrent Smoker 3.556445e+00
## smokingFormer Smoker 2.179138e+00
## height 1.172685e+01
## weight -1.267781e+01
## bmi 3.825160e+01
## hypertension1 1.903922e+00
## diabetes1 -1.662082e+00
## SBP 2.140266e-02
## LDL -2.939422e-02
## vaccine1 -6.793896e+00
## severity1 7.273653e+00
## studyA -4.878764e+00
set.seed(1)
# Elastic net, predictor subset (same fixes: drop the ignored `data =`
# argument; spell out TRUE instead of T).
enet.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = seq(0, 1, length = 21),
    lambda = exp(seq(-5, 5, length = 100))
  ),
  trControl = ctrl_best,
  standardize = TRUE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Selected (alpha, lambda) pair for the subset model
enet.fit.sub$bestTune
## alpha lambda
## 55 0 1.575457
# Tuning surface for the subset elastic net and coefficients at the optimum
myCol <- rainbow(25)
myPar <- list(
  superpose.symbol = list(col = myCol),
  superpose.line = list(col = myCol)
)
plot(enet.fit.sub, par.settings = myPar)
coef(enet.fit.sub$finalModel, s = enet.fit.sub$bestTune$lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 133.1787123
## height -0.7679694
## weight 0.5738241
## vaccine1 -6.4862640
## severity1 7.1420582
## studyA -4.8267005
set.seed(1)
# Ridge regression, full data (alpha = 0), tuned over a wide log-lambda grid.
# Fix: drop `data =`, which train() ignores when x/y are given.
ridge.fit <- train(
  x = x,
  y = y,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = exp(seq(10, -6, length = 100))
  ),
  trControl = ctrl_best
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Tuning curve on the log-lambda scale, then the selected tuning values
plot(ridge.fit, xTrans = log)
ridge.fit$bestTune
## alpha lambda
## 34 0 0.5134171
# Coefficients at the selected ridge penalty
coef(ridge.fit$finalModel, s = ridge.fit$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -101.92788195
## age 0.19639986
## genderMale -2.71878900
## raceAsian 4.01116555
## raceBlack -0.15248392
## raceHispanic -0.98352878
## smokingCurrent Smoker 3.36685579
## smokingFormer Smoker 1.91249121
## height 0.49957702
## weight -0.78408717
## bmi 4.11762834
## hypertension1 1.68868623
## diabetes1 -1.84842122
## SBP 0.04152787
## LDL -0.02903930
## vaccine1 -6.71601476
## severity1 6.91370346
## studyA -4.97385571
set.seed(1)
# Ridge regression, predictor subset (same fix: drop the ignored `data =`).
ridge.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = exp(seq(10, -6, length = 100))
  ),
  trControl = ctrl_best
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Tuning curve on the log-lambda scale, then the selected tuning values
plot(ridge.fit.sub, xTrans = log)
ridge.fit.sub$bestTune
## alpha lambda
## 41 0 1.591451
# Coefficients at the selected ridge penalty (subset model)
coef(ridge.fit.sub$finalModel, s = ridge.fit.sub$bestTune$lambda)
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 133.1115977
## height -0.7673816
## weight 0.5733665
## vaccine1 -6.4823273
## severity1 7.1377216
## studyA -4.8237001
# Partial least squares, full data: tune the number of components (1-15)
set.seed(1)
pls.fit <- train(
  x = x,
  y = y,
  method = "pls",
  tuneGrid = data.frame(ncomp = 1:15),
  trControl = ctrl_best,
  preProcess = c("center", "scale")
)
ggplot(pls.fit, highlight = TRUE) + theme_classic()
# Partial least squares, predictor subset (at most 6 components available)
set.seed(1)
pls.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "pls",
  tuneGrid = data.frame(ncomp = 1:6),
  trControl = ctrl_best,
  preProcess = c("center", "scale")
)
ggplot(pls.fit.sub, highlight = TRUE) + theme_classic()
# MARS: tune the interaction degree and the number of retained terms
mars_grid <- expand.grid(degree = 1:3, nprune = 2:15)
set.seed(1)
# MARS, full data
mars.fit <- train(
  x = x,
  y = y,
  method = "earth",
  tuneGrid = mars_grid,
  metric = "RMSE",
  trControl = ctrl_best
)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
##
## rescale
## Loading required package: TeachingDemos
# Tuning profile and the selected (nprune, degree) pair
ggplot(mars.fit) + theme_classic()
mars.fit$bestTune
## nprune degree
## 16 3 2
# Hinge-function coefficients of the pruned MARS model
coef(mars.fit$finalModel)
## (Intercept) h(bmi-30.7) h(bmi-30.7) * studyA
## 38.85227 27.39024 -20.91526
# MARS on the predictor subset, using the same tuning grid
set.seed(1)
mars.fit.sub <- train(
  x = x_sub,
  y = y_sub,
  method = "earth",
  tuneGrid = mars_grid,
  metric = "RMSE",
  trControl = ctrl_best
)
ggplot(mars.fit.sub) + theme_classic()
mars.fit.sub$bestTune
## nprune degree
## 41 14 3
# Hinge-function coefficients of the pruned subset MARS model
coef(mars.fit.sub$finalModel)
## (Intercept)
## 36.54169610
## h(height-159.6)
## -0.45993611
## h(159.6-height)
## 10.10337216
## h(159.6-height) * h(weight-81.3)
## 6.27108983
## h(159.6-height) * h(81.3-weight)
## -0.21300902
## h(159.6-height) * h(weight-81.3) * studyA
## -5.89235302
## h(weight-77.8)
## 1.08027270
## h(171.6-height) * h(weight-77.8)
## 0.53281219
## vaccine1
## -6.58139206
## h(171.6-height) * h(weight-77.8) * studyA
## -0.40470585
## severity1
## 7.48014816
## h(height-159.6) * h(87.3-weight)
## 0.07525588
## h(159.6-height) * studyA
## -6.23583295
# Compare cross-validated RMSE distributions across all fitted models
model_list <- list(
  lasso = lasso.fit,
  enet = enet.fit,
  ridge = ridge.fit,
  pls = pls.fit,
  mars = mars.fit,
  lasso.sub = lasso.fit.sub,
  enet.sub = enet.fit.sub,
  ridge.sub = ridge.fit.sub,
  pls.sub = pls.fit.sub,
  mars.sub = mars.fit.sub
)
resamples <- resamples(model_list)
bwplot(resamples, metric = "RMSE")
# To save the comparison plot to disk instead:
# png("plot_rmse.png", width = 800, height = 600, res = 150)
# bwplot(resamples, metric = "RMSE")
# dev.off()
# Prepare the test design matrices for predictions (drop intercept column)
x_test <- model.matrix(recovery_time ~ ., test_data)[, -1]
x_test_sub <- model.matrix(recovery_time ~ ., test_data_sub)[, -1]

# Helper: test-set RMSE of a fitted caret model on a new design matrix.
# Factored out to remove the 10x duplicated predict/postResample expression.
test_rmse <- function(fit, newx, truth) {
  postResample(predict(fit, newdata = newx), truth)[1]
}

# Collect the model names and their test-set RMSE values
test_RMSE <- tibble(
  Model = c("LASSO", "Elastic Net", "Ridge", "PLS", "MARS",
            "LASSO (Subset)", "Elastic Net (Subset)", "Ridge (Subset)",
            "PLS (Subset)", "MARS (Subset)"),
  RMSE = c(
    test_rmse(lasso.fit, x_test, test_data$recovery_time),
    test_rmse(enet.fit, x_test, test_data$recovery_time),
    test_rmse(ridge.fit, x_test, test_data$recovery_time),
    test_rmse(pls.fit, x_test, test_data$recovery_time),
    test_rmse(mars.fit, x_test, test_data$recovery_time),
    test_rmse(lasso.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(enet.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(ridge.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(pls.fit.sub, x_test_sub, test_data_sub$recovery_time),
    test_rmse(mars.fit.sub, x_test_sub, test_data_sub$recovery_time)
  )
)
# Rank the models from best (lowest RMSE) to worst
test_RMSE %>% arrange(RMSE)
## # A tibble: 10 × 2
## Model RMSE
## <chr> <dbl>
## 1 MARS (Subset) 17.8
## 2 MARS 17.9
## 3 PLS 18.4
## 4 Elastic Net 18.4
## 5 LASSO 19.1
## 6 Ridge 19.8
## 7 Ridge (Subset) 20.4
## 8 Elastic Net (Subset) 20.4
## 9 PLS (Subset) 20.4
## 10 LASSO (Subset) 20.8